1 Set constants

# set style
theme_set(theme_bw())

# set figure directory
figure_dir <- here("analysis/figures")


# ground truth distributions for this experiment
true_dists <- data.frame(distribution = c("control", "treatment",
                                          "control", "treatment"),
                         mean = c(130, 120, 13, 9),
                         sd = c(12, 12, 4, 4),
                         medication_type = c('Blood pressure scenario', 'Blood pressure scenario',
                                         'COVID-19 scenario', 'COVID-19 scenario'))

# ground truth cohen's d and probability of superiority (auc) for this experiment
# note: exact empirical estimates for Cohen's D were c(1.28, 1.02), but using true population values
# these will be identical in pre-registered experiment
true_effects <- data.frame(medication_type = c('Blood pressure scenario', 'COVID-19 scenario'),
                           cohens_d = c((130-120)/12, (13 - 9)/4)) %>%
                           mutate(auc = pnorm(cohens_d / sqrt(2)))

2 Load and clean data

# read in responses
preprocess_df <- function(dataframe, start_varname = "ts_start", submit_varname="ts_final_submit") {
  # use combination of worker_id and assignment_id
  # as the unique participant id
  sha1 <- Vectorize(sha1)
  dataframe <- dataframe %>%
    mutate(
      original_worker_id = worker_id,
      worker_id = sha1(paste(worker_id, assignment_id, sep=""))
    )

  # get list of completed assignments
  completed_assignments <- dataframe %>%
    filter(variable == submit_varname) %>%
    distinct(worker_id)
  
  # limit to completed assignments
  # drop all timestamps except start and submit for now
  dataframe <- dataframe %>%
    inner_join(completed_assignments) %>%
    filter(variable != "id") %>%
    filter(!grepl('^ts', variable) | variable == start_varname | variable == submit_varname) 
  
  # go from turk's "long" format to "wide" format with one variable per column
  dataframe_spread <- spread(dataframe, variable, value)
  
  # parse all estimates to numbers
  dataframe <- dataframe_spread %>% 
    filter(!is.na(cost_estimate_se)) %>%
    mutate(
      superiority_estimate_se = parse_number(superiority_estimate_se),
      superiority_estimate_sd = parse_number(superiority_estimate_sd),
      cost_estimate_se = parse_number(cost_estimate_se),
      cost_estimate_sd = parse_number(cost_estimate_sd)
    )
  
  # create columns for first and second estimates based on which condition they saw first
  dataframe <- dataframe %>%
    mutate(first_superiority_estimate = (ifelse(first_condition == 'sd', superiority_estimate_sd, superiority_estimate_se)),
           second_superiority_estimate = (ifelse(first_condition == 'sd', superiority_estimate_se, superiority_estimate_sd)),
           first_cost_estimate = (ifelse(first_condition == 'sd', cost_estimate_sd, cost_estimate_se)),
           second_cost_estimate = (ifelse(first_condition == 'sd', cost_estimate_se, cost_estimate_sd))) 
  
  # create column for whether SD estimate is lower than SE estimate
  dataframe <- dataframe %>%
    mutate(superiority_estimate_sd_lower_than_se = ifelse(superiority_estimate_sd < superiority_estimate_se, T, F),
           superiority_estimate_sd_equal_to_se = ifelse(superiority_estimate_sd == superiority_estimate_se, T, F),
           superiority_estimate_sd_higher_than_se = ifelse(superiority_estimate_sd > superiority_estimate_se, T, F),
           cost_estimate_sd_lower_than_se = ifelse(cost_estimate_sd < cost_estimate_se, T, F))
  
  # convert to pretty labels for conditions
  dataframe <- dataframe %>%
    mutate(first_condition_compressed = case_when(first_condition == 'sd' ~ 'SDs',
                                       first_condition == 'se' ~ 'SEs'),
           first_condition = case_when(first_condition == 'sd' ~ 'Saw SDs first',
                                       first_condition == 'se' ~ 'Saw SEs first'),
  #         first_condition = factor(first_condition, levels = c('Saw SEs first', 'Saw SDs first')),
           medication_type = case_when(medication_type == 'blood_pressure' ~ 'Blood pressure scenario',
                                       medication_type == 'covid_19' ~ 'COVID-19 scenario'))
  
  dataframe$first_condition = factor(dataframe$first_condition, levels = c('Saw SEs first', 'Saw SDs first'))
  # extract distribution builder responses
    
  dataframe
}


df.turkers <- read_tsv(here('raw-data/mturk.tsv'), col_types="ccccc") %>%
  preprocess_df(start_varname = "ts_consent_start", submit_varname="ts_submitted_") %>%
  mutate(across(starts_with('ts_'), as.POSIXct)) %>%
  mutate(time_spent = difftime(as.POSIXct(ts_submitted_),
                               as.POSIXct(ts_consent_start),
                               units = "min"))
## Joining with `by = join_by(worker_id)`
bin_labels <- df.turkers %>%
  distinct(medication_type, distbuilder_bins) %>%
  rename(label = distbuilder_bins) %>%
  mutate(label = strsplit(as.character(label), ",")) %>%
  unnest(label) %>%
  mutate(label = gsub('to', '-', label),
         label = gsub('[^0-9-]', '', label)) %>%
  group_by(medication_type) %>%
  mutate(bin = row_number()) %>%
  extract(label, into = c("lower", "upper"), regex="([0-9]+)-([0-9]+)", remove = F) %>%
  mutate(lower = as.numeric(lower),
         upper = as.numeric(upper),
         center = (lower + upper) / 2)

There were people who completed the experiment in total.

2.1 Remove anyone who had an ill-formed response

df.turkers <- df.turkers %>% drop_na(matches('(superiority|cost)_estimate_'))
nrow(df.turkers)
## [1] 500

3 Design

This was a mixed between-subjects and within-subjects experiment.

For the between-subjects part, people were randomly assigned to see information about one of two medication types (stored in the variable medication_type): one for blood pressure or one for COVID-19.

For the within-subjects part, each person saw two different visual presentations of the information about this medication, one showing means and standard erorrs (SEs) and one showing means and standard deviations (SDs). Everyone saw the same visualizations of the same underlying data, it was just a matter of which they saw first. They were randomly assigned to see either SEs or SDs first, after which they saw the other visualization. The variable first_condition contains the visualization type they saw first.

Here are links to all four conditions:

first_condition: Saw SEs first first_condition: Saw SDs first
medication_type: blood pressure http://jhofman.github.io/medical-effects/?medication_type=blood_pressure&condition=se http://jhofman.github.io/medical-effects/?medication_type=blood_pressure&condition=sd
medication_type: covid-19 http://jhofman.github.io/medical-effects/?medication_type=covid_19&condition=se http://jhofman.github.io/medical-effects/?medication_type=covid_19&condition=sd

We also gave a text-based explanation of the underlying distributions and what the error bars meant.

When seeing SDs, people were shown this text: “Outcomes were approximately normally distributed in both groups before and after the treatment, with roughly equal percentages of patients falling above and below the average. The error bars in the figure show two standard deviations above and below the average in each group. Predictive intervals like these are constructed such that they should, in the long run, contain outcomes for 95% of similar patients in future studies.”

When seeing SEs, people were shown this instead: “Outcomes were approximately normally distributed in both groups before and after the treatment, with roughly equal percentages of patients falling above and below the average. The error bars in the figure show two standard errors above and below the average in each group. Confidence intervals like these are constructed such that 95% percent of them should, in the long run, contain the true average for similar patients in future studies.”

4 Sanity checks

4.1 Randomization check

Randomization looks to be roughly balanced across conditions. The COVID-19 / SE first condition counts are a bit low, small chance that fewer people are completing that condition but probably just a fluke and not a statistically significant difference.

counts <- df.turkers %>%
  count(medication_type, first_condition)

kable(counts)
medication_type first_condition n
Blood pressure scenario Saw SEs first 134
Blood pressure scenario Saw SDs first 117
COVID-19 scenario Saw SEs first 123
COVID-19 scenario Saw SDs first 126
chisq.test(counts$n)
## 
##  Chi-squared test for given probabilities
## 
## data:  counts$n
## X-squared = 1.2, df = 3, p-value = 0.753

4.2 Time spent on the experiment

The median time to complete the experiment was 8.96666666666667 minutes. The plot below is limited to participants who completed in 100 minutes or less, excluding the top 0 percent of outliers in terms of completion time.

ggplot(df.turkers, aes(x = time_spent)) +
  geom_histogram() +
  facet_wrap(~ medication_type) +
  labs(x = "Time spent in minutes",
       y = "Number of participants",
       title = "Time spent on the experiment by medication scenario") +
  xlim(c(0,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

5 Primary hypotheses

5.1 Estimated probability of superiority

After seeing information about the medication along with a figure, we asked people to estimate the probability of superiority for the treatment over the control condition. Specifically, we asked “What is your best estimate of the probability that a randomly selected patient in the treatment group recovered more quickly than a randomly selected patient in the control group? (A 50% probability would indicate no difference in outcomes between the treatment and control groups.)”

5.1.1 Between-subjects effect of first visualization seen on first probability of superiority estimate

This looks at just the between-subjects effect that seeing SEs vs. SDs had by comparing people’s responses to only the first visualization condition they saw, ignoring the second estimates. For both medication types, people estimated the probability of superiority to be higher, on average, when they saw visualizations with SEs than with SDs.

5.1.1.1 Plots

This is plotted in two ways below, first means + one standard error, and then full distributions of responses. In the second type of plot the means and standard errors are indicated by points and horizontal error bars.

It’s interesting—and a bit puzzling—to see how the average estimates compare to the actual probabilities of superiorities, indicated by the dashed lines in each panel.

The distributions show that SEs result in many more “close to 100%” type responses.

turkers_estimated_psup <- plot_beeswarm_two_conditions_with_truth(df.turkers, true_effects, "first_condition", "first_superiority_estimate") +
  facet_wrap(~ medication_type) +
  labs(y="Estimated probability of superiority") +
  xlab("First visualization seen")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
turkers_estimated_psup

ggsave("figures/turkers_estimated_psup.pdf", width=6, height=4)
summary_stats_providers <- df.turkers %>% group_by(medication_type, first_condition) %>%
  summarize(mu=mean(first_superiority_estimate),
            se=sd(first_superiority_estimate)/sqrt(n()))
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
blood_se_first_mu <- (summary_stats_providers %>% filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SEs first"))$mu
blood_sd_first_mu <- (summary_stats_providers %>% filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SDs first"))$mu
covid_se_first_mu <- (summary_stats_providers %>% filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SEs first"))$mu
covid_sd_first_mu <- (summary_stats_providers %>% filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SDs first"))$mu

blood_psup_test <- (df.turkers %>%
  filter(medication_type == "Blood pressure scenario") %>%
  t.test(first_superiority_estimate ~ first_condition, data = .) %>%
  apa_custom())$statistic %>%
  remove_dollar_signs()

covid_psup_test <- (df.turkers %>%
  filter(medication_type == "COVID-19 scenario") %>%
  t.test(first_superiority_estimate ~ first_condition, data = .) %>%
  apa_custom())$statistic %>%
  remove_dollar_signs()
  
# Extreme values

blood_extreme_se_perc <- round(100*(df.turkers %>%
  filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SEs first") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  summarize(prop_extreme = mean(extreme)))$prop_extreme)

blood_extreme_sd_perc <- round(100*(df.turkers %>%
  filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SDs first") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  summarize(prop_extreme = mean(extreme)))$prop_extreme)

covid_extreme_se_perc <- round(100*(df.turkers %>%
  filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SEs first") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  summarize(prop_extreme = mean(extreme)))$prop_extreme)

covid_extreme_sd_perc <- round(100*(df.turkers %>%
  filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SDs first") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  summarize(prop_extreme = mean(extreme)))$prop_extreme)

blood_extreme_test <- (df.turkers %>%
  filter(medication_type == "Blood pressure scenario") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  t.test(extreme ~ first_condition, data=.) %>%
  apa_custom())$statistic %>%
  remove_dollar_signs()

covid_extreme_test <- (df.turkers %>%
  filter(medication_type == "COVID-19 scenario") %>%
  mutate(extreme = first_superiority_estimate > 90) %>%
  t.test(extreme ~ first_condition, data=.) %>%
  apa_custom())$statistic %>%
  remove_dollar_signs()

c(blood_psup_test, covid_psup_test, blood_extreme_test, covid_extreme_test)
## [1] "t(244.8) = 4.86, p < .001" "t(229.9) = 4.54, p < .001"
## [3] "t(202.2) = 4.02, p < .001" "t(145.5) = 4.77, p < .001"

5.1.1.2 Significance tests

Our primary, confirmatory hypothesis is that participants will estimate a greater probability of superiority for the treatment when the results of a hypothetical RCT communicate inferential uncertainty (i.e., by displaying standard errors around a point estimate) than when these results communicate outcome uncertainty (i.e., by displaying standard deviations).

The first set of tests of this hypothesis is a between-subjects test using two separate, independent-groups t-tests. Specifically, within each of the two medical scenarios, we compare probability of superiority estimates between participants who see the inferential uncertainty condition first to participants who see the outcome uncertainty condition first.

df.turkers %>% 
  group_by(medication_type) %>% 
  do(tidy(t.test(first_superiority_estimate ~ first_condition,
                 data = .,
                 conf.level = 0.95,
                 alternative = c("greater")))) %>%
  kable()
medication_type estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
Blood pressure scenario 8.416188 75.72388 67.30769 4.859116 1.1e-06 244.8105 5.556413 Inf Welch Two Sample t-test greater
COVID-19 scenario 7.446767 76.43089 68.98413 4.543675 4.5e-06 229.8704 4.740058 Inf Welch Two Sample t-test greater

We see a statistically significant difference: in both medical scenarios, those who saw SEs first gave higher estimates, on average, than those who saw SDs first. This supports our primary hypothesis.

5.1.2 Within-subjects effect of seeing one visualization vs. the other on probability of superiority

This considers both estimates that people made, comparing their estimated probability of superiority for the SD visualization to the same estimate for the SE visualization. This is broken down by which medication type they were shown and which visualization they saw first.

5.1.2.1 Plots

Regardless of which visualization people saw first, they estimated the probability of superiority to be higher when they saw SEs compared to SDs.

plot_data_turkers <- df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type) %>%
  summarize(mu = mean(diff_superiority_se_vs_sd),
            se = sd(diff_superiority_se_vs_sd) / sqrt(n())) %>% 
  mutate(y = 1:n())

kable(plot_data_turkers)
medication_type mu se y
Blood pressure scenario 8.533864 0.9622555 1
COVID-19 scenario 7.433735 0.9255478 2
df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  ggplot(aes(x=medication_type, y=diff_superiority_se_vs_sd)) +
    geom_point(mapping=aes(color=medication_type), position = position_jitter(width=0.1)) +
  #  scale_color_manual(values=c("#af8dc3", "#7fbf7b")) +
    stat_summary(fun = mean, geom = "point", size=1.2) +
    stat_summary(fun.data="mean_se",  fun.args = list(mult=1), 
                 geom="errorbar", color = "black", width=0.1) +
    theme(legend.position = "none") +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  xlab("Scenario") +
  ylab("Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs")

ggsave(here("analysis/figures/turkers_within_subjects_drop_in_psup.pdf"), width=4, height=4)
within_subjects_diff_statistics <- df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type) %>%
  summarize(
    mu=mean(diff_superiority_se_vs_sd),
    se=sd(diff_superiority_se_vs_sd)/sqrt(n())
  )

blood_within_mu <- (within_subjects_diff_statistics %>% filter(medication_type == "Blood pressure scenario"))$mu
blood_within_se <- (within_subjects_diff_statistics %>% filter(medication_type == "Blood pressure scenario"))$se
covid_within_mu <- (within_subjects_diff_statistics %>% filter(medication_type == "COVID-19 scenario"))$mu
covid_within_se <- (within_subjects_diff_statistics %>% filter(medication_type == "COVID-19 scenario"))$se

ggplot(plot_data_turkers, aes(x = medication_type, y = mu)) +
  geom_pointrange(aes(ymin = mu - se, ymax = mu + se)) +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  #facet_wrap(~ medication_type, scale = "free_x") +
  #scale_y_continuous(lim = c(0, 10)) +
  labs(x = '',
       y = 'Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs',
       title = 'Average drop in probability of superiority estimate when seeing SDs vs. SEs',
       subtitle = 'Bars show one standard error') +
  coord_flip()

scale <- 0.7
df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  ggplot(aes(x = diff_superiority_se_vs_sd, y = medication_type)) +
  geom_density_ridges(stat = "binline", binwidth = 1, scale = scale) +
  geom_errorbarh(data = plot_data_turkers, aes(x = mu, xmin = mu - se, xmax = mu + se, y = y + scale), height = 0.1) +
  geom_point(data = plot_data_turkers, aes(x = mu, y = y + scale)) +
  coord_cartesian(ylim = c(1.25, 2.25)) +
  labs(x = 'Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs',
       y = 'First visualization seen',
       title = 'Distribution of drop in probability of superiority estimate\nwhen seeing SDs vs. SEs') +
  #facet_wrap(~ medication_type, ncol = 1) +
  theme(legend.position = "none")
## Warning in geom_errorbarh(data = plot_data_turkers, aes(x = mu, xmin = mu - :
## Ignoring unknown aesthetics: x

We can also look at the full set of responses. For each person we plot their two estimates (one for when they saw SDs and one for when they saw SEs) connected by a line. The slope of the line indicates how much they changed their estimates, with a positive slope indicating that the estimate when seeing SEs was higher than when seeing SDs. Negatively-sloped lines are colored blue, and as hypothesized, the majority of people revised in this direction (somewhere between 60 and 70 percent of people, depending on the condition).

plot_data <- df.turkers %>%
  select(worker_id, medication_type, first_condition,
         superiority_estimate_sd_lower_than_se,
         superiority_estimate_sd_equal_to_se,
         superiority_estimate_sd_higher_than_se,
         superiority_estimate_sd, superiority_estimate_se) %>%
  gather("variable", "value", superiority_estimate_sd, superiority_estimate_se) %>%
  mutate(variable = case_when(variable == 'superiority_estimate_sd' ~ 'When seeing SDs',
                              variable == 'superiority_estimate_se' ~ 'When seeing SEs'),
         variable = factor(variable, levels = c('When seeing SEs', 'When seeing SDs')))

plot_data %>%
  group_by(medication_type, first_condition) %>%
  summarize(frac_sd_lower_than_se = mean(superiority_estimate_sd_lower_than_se),
            frac_sd_equal_to_se = mean(superiority_estimate_sd_equal_to_se),
            frac_sd_higher_than_se = mean(superiority_estimate_sd_higher_than_se)) %>%
  kable()
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
medication_type first_condition frac_sd_lower_than_se frac_sd_equal_to_se frac_sd_higher_than_se
Blood pressure scenario Saw SEs first 0.5970149 0.2164179 0.1865672
Blood pressure scenario Saw SDs first 0.6153846 0.1965812 0.1880342
COVID-19 scenario Saw SEs first 0.5284553 0.2520325 0.2195122
COVID-19 scenario Saw SDs first 0.5793651 0.2222222 0.1984127
ggplot(plot_data, aes(x = variable, y = value, group = worker_id, color = superiority_estimate_sd_lower_than_se)) +
  geom_point() +
  geom_line(alpha = 0.25) +
  facet_grid(medication_type ~ first_condition) +
  labs(x = '',
       y = 'Estimated probability of superiority',
       title = 'Change in probability of superiority estimates when seeing SEs vs. SDs') +
  theme(legend.position = "none")

5.1.2.2 Significance tests

The second set of tests of our primary hypothesis is a within-subjects test using two separate, independent-groups t-tests. Specifically, within each of the two medical scenarios, we compute a within-participant change score as participants’ probability of superiority estimate in the inferential uncertainty condition minus their estimate in the outcome uncertainty condition. We test whether there is statistical significance for these change scores being greater than zero.

df.turkers %>% 
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type) %>% 
  do(tidy(t.test(x = .$diff_superiority_se_vs_sd,
                 mu = 0,
                 conf.level = 0.95,
                 type = "two.sample",
                 alternative = c("greater")))) %>%
  kable()
medication_type estimate statistic p.value parameter conf.low conf.high method alternative
Blood pressure scenario 8.533864 8.868606 0 250 6.945208 Inf One Sample t-test greater
COVID-19 scenario 7.433735 8.031714 0 248 5.905636 Inf One Sample t-test greater

We see a statistically significant difference: in both medical scenarios, we see that’s people’s estimates for the probability of superiority were higher, on average, when they made the estimate after seeing SEs compared to making the estimate after seeing SDs, leading to a positive change score. This supports our primary hypothesis.

An alternative is to just look at the fraction of people who had a lower estimate when they saw SDs compared to when they saw SEs, implemented as a sign test. This ignores the magnitude of changes and just considers direction.

df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type) %>%
  do(tidy(DescTools::SignTest(.$diff_superiority_se_vs_sd, mu = 0)))
## # A tibble: 2 × 9
## # Groups:   medication_type [2]
##   medication_type         estimate statistic  p.value parameter conf.low.lwr.ci
##   <chr>                      <dbl>     <dbl>    <dbl>     <dbl>           <dbl>
## 1 Blood pressure scenario        5       152 4.16e-14       199               5
## 2 COVID-19 scenario              5       138 3.50e-10       190               0
## # ℹ 3 more variables: conf.high.upr.ci <dbl>, method <chr>, alternative <chr>

Again we see a statistically significant difference for both scenarios.

6 Additional analyses

6.1 Estimated probability of superiority

6.1.1 Extreme responses for first probability of superiority estimate

Now we compare extreme responses for participant’s probability of superiority estimate. Specifically, what fraction of people in each condition gave a higher than 90% estimate for the first visualization they saw?

df.turkers %>%
  group_by(medication_type, first_condition) %>%
  summarize(frac_greater_than_90_percent = mean(first_superiority_estimate > 90)) %>%
  kable()
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
medication_type first_condition frac_greater_than_90_percent
Blood pressure scenario Saw SEs first 0.2014925
Blood pressure scenario Saw SDs first 0.0427350
COVID-19 scenario Saw SEs first 0.1951220
COVID-19 scenario Saw SDs first 0.0158730

We see that while only a few people who saw SDs estimated the probability of superiority to be over 90%, roughly 20% of people who were shown SEs did so.

6.1.2 Mean absolute error in first probability of superiority estimate

Now we compare probability of superiority estimates to their true value, as calculated from the Cohen’s D value that participants could have inferred from the visualizations if they could do so perfectly.

kable(true_effects)
medication_type cohens_d auc
Blood pressure scenario 0.8333333 0.7221551
COVID-19 scenario 1.0000000 0.7602499
plot_data <- df.turkers %>%
  left_join(true_effects) %>%
  mutate(abs_err = abs(first_superiority_estimate - 100*auc)) %>%
  select(worker_id, medication_type, first_condition, abs_err) %>%
  group_by(medication_type, first_condition) %>%
  summarize(mu = mean(abs_err),
            se = sd(abs_err) / sqrt(n())) %>% 
  group_by(medication_type) %>%
  mutate(y = 1:n())
## Joining with `by = join_by(medication_type)`
## `summarise()` has grouped output by 'medication_type'. You can override using the `.groups` argument.
kable(plot_data)
medication_type first_condition mu se y
Blood pressure scenario Saw SEs first 13.52110 0.7184128 1
Blood pressure scenario Saw SDs first 10.74705 0.6432486 2
COVID-19 scenario Saw SEs first 11.50549 0.7807899 1
COVID-19 scenario Saw SDs first 10.71190 0.6865797 2
ggplot(plot_data, aes(x = first_condition, y = mu)) +
  geom_hline(data = true_effects, aes(yintercept = 0), linetype = 'dashed') +
  geom_pointrange(aes(ymin = mu - se, ymax = mu + se)) +
  facet_wrap(~ medication_type) +
  labs(x = 'First visualization seen',
       y = 'MAE in estimated probability of superiority\nfor first visualization seen',
       title = 'Error in probability of superiority estimate for first visualization seen',
       subtitle = 'Bars show one standard error')

6.1.3 Within-subjects effect of seeing one visualization vs. the other on probability of superiority, split by which was seen first

We can repeat our analysis of the within-subjects change in estimates from above, but splitting by which type of visualization people saw first, and we see some evidence suggesting that the drop in probability of superiority estimates is slightly larger for the people who saw SEs first, although doesn’t appear to be statistically significant.

plot_data <- df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type, first_condition) %>%
  summarize(mu = mean(diff_superiority_se_vs_sd),
            se = sd(diff_superiority_se_vs_sd) / sqrt(n())) %>% 
  group_by(medication_type) %>%
  mutate(y = 1:n())
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
kable(plot_data)
medication_type first_condition mu se y
Blood pressure scenario Saw SEs first 9.179104 1.326163 1
Blood pressure scenario Saw SDs first 7.794872 1.400999 2
COVID-19 scenario Saw SEs first 8.764228 1.435823 1
COVID-19 scenario Saw SDs first 6.134921 1.169254 2
ggplot(plot_data, aes(x = first_condition, y = mu)) +
  geom_pointrange(aes(ymin = mu - se, ymax = mu + se)) +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  facet_wrap(~ medication_type) +
  labs(x = 'First visualization seen',
       y = 'Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs',
       title = 'Average drop in probability of superiority estimate when seeing SDs vs. SEs',
       subtitle = 'Bars show one standard error')

scale <- 0.7
df.turkers %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  ggplot(aes(x = diff_superiority_se_vs_sd, y = first_condition, color = first_condition, fill = first_condition)) +
  geom_density_ridges(stat = "binline", binwidth = 1, scale = scale) +
  geom_errorbarh(data = plot_data, aes(x = mu, xmin = mu - se, xmax = mu + se, y = y + scale), height = 0.1) +
  geom_point(data = plot_data, aes(x = mu, y = y + scale)) +
  coord_cartesian(ylim = c(1.25, 2.25)) +
  labs(x = 'Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs',
       y = 'First visualization seen',
       title = 'Distribution of drop in probability of superiority estimate\nwhen seeing SDs vs. SEs') +
  facet_wrap(~ medication_type, ncol = 1) +
  theme(legend.position = "none")
## Warning in geom_errorbarh(data = plot_data, aes(x = mu, xmin = mu - se, :
## Ignoring unknown aesthetics: x

6.2 Estimated value of the treatment

We also asked people to estimate the probability of superiority for the treatment over the control condition. We gave them prices to anchor around to reduce noise in estimates.

For blood pressure we phrased this as follows: “Based on its effectiveness and the price of other medications, how much do you think a 30 day supply of this new medication would be worth to patients with high blood pressure? For reference, a 30 day supply of generic ACE inhibitors, also used to lower blood pressure, costs about $20.”

For COVID-19 we phrased this as: “Based on its effectiveness and the price of other medications, how much do you think a 5 day supply of this new medication would be worth to patients hospitalized with COVID-19? For reference, a 5 day supply of the most popular drug used for this purpose costs about $2500.”

6.2.1 Between-subjects effect of first visualization seen on value of treatment estimate

Note that instead of looking at means (as with probability of superiority above), we look at medians here to reduce sensitivity to a handful of extreme responses.

plot_data_value <- df.turkers %>%
  mutate(direction = ifelse(cost_estimate_sd < cost_estimate_se, T, F)) %>%
  select(worker_id, medication_type, first_condition, direction, first_cost_estimate, second_cost_estimate) %>%
  group_by(medication_type, first_condition) %>%
  do(bootstrapped_median_ci(., "first_cost_estimate", reps = 10000, conf = 0.95)) %>%
  group_by(medication_type) %>%
  mutate(y = 1:n())

estimated_value_blood_se <- plot_data_value %>% filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SEs first")
estimated_value_blood_sd <- plot_data_value %>% filter(medication_type == "Blood pressure scenario" & first_condition == "Saw SDs first")
estimated_value_covid_se <- plot_data_value %>% filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SEs first")
estimated_value_covid_sd <- plot_data_value %>% filter(medication_type == "COVID-19 scenario" & first_condition == "Saw SDs first")

kable(plot_data_value)
medication_type first_condition lower median upper y
Blood pressure scenario Saw SEs first 30 30 32.5 1
Blood pressure scenario Saw SDs first 26 28 31.0 2
COVID-19 scenario Saw SEs first 1500 2000 2200.0 1
COVID-19 scenario Saw SDs first 2500 2500 3000.0 2
ggplot(plot_data_value, aes(x = first_condition, y = median)) +
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  facet_wrap(~ medication_type, scale = "free_y") +
  labs(x = 'First visualization seen',
       y = 'Estimated value of treatment\nfor first visualization seen',
       title = 'Median value of treatment estimate for first visualization seen',
       subtitle = 'Bars show bootstrapped 95% confidence interval')

# remove outliers for plot
df_trimmed <- df.turkers %>%
  group_by(medication_type) %>%
  filter(cost_estimate_sd > quantile(cost_estimate_sd, 0.025) &
           cost_estimate_sd < quantile(cost_estimate_sd, 0.975) &
           cost_estimate_se > quantile(cost_estimate_se, 0.025) &
           cost_estimate_se < quantile(cost_estimate_se, 0.975) )

scale <- 0.7
ggplot(df_trimmed, aes(x = first_cost_estimate, y = first_condition, color = first_condition, fill = first_condition)) +
  geom_density_ridges(stat = "binline", binwidth = 1, scale = scale) +
  geom_errorbarh(data = plot_data_value, aes(x = median, xmin = lower, xmax = upper, y = y + scale), height = 0.1) +
  geom_point(data = plot_data_value, aes(x = median, y = y + scale)) +
  coord_cartesian(ylim = c(1.25, 2.25)) +
  labs(x = 'Estimated value of treatment\nfor first visualization seen',
       y = 'First visualization seen',
       title = 'Distribution of value of treatment estimates for first visualization seen',
       subtitle = 'Points and bars show medians with 95% bootstrapped confidence intervals') +
  facet_wrap(~ medication_type, ncol = 1, scale = "free") +
  theme(legend.position = "none")
## Warning in geom_errorbarh(data = plot_data_value, aes(x = median, xmin = lower,
## : Ignoring unknown aesthetics: x

Note: in the distribution plot above we’ve removed the top and bottom 2.5% of outliers within each scenario so the scales are reasonable.

ggplot(df.turkers, aes(x = log10(first_cost_estimate), y = first_condition, color = first_condition, fill = first_condition)) +
  geom_density_ridges(stat = "binline", binwidth = 1, scale = scale) +
  geom_errorbarh(data = plot_data_value, aes(x = log10(median), xmin = log10(lower), xmax = log10(upper), y = y + 1), height = 0.1) +
  geom_point(data = plot_data_value, aes(x = log10(median), y = y + 1)) +
  coord_cartesian(ylim = c(1.25, 2.75)) +
  labs(x = 'Estimated log10 value of treatment\nfor first visualization seen',
       y = 'First visualization seen',
       title = 'Distribution of value of treatment estimates for first visualization seen',
       subtitle = 'Points and bars show medians with 95% bootstrapped confidence intervals') +
  facet_wrap(~ medication_type, ncol = 1, scale = "free") +
  theme(legend.position = "none")
## Warning in geom_errorbarh(data = plot_data_value, aes(x = log10(median), :
## Ignoring unknown aesthetics: x
## Warning: Removed 1 rows containing non-finite values (`stat_binline()`).

dollars <- function(x) {
  paste("$", comma(x, accuracy=1), sep="")
}

turkers_value_of_treatment <- ggplot(df.turkers, aes(x=first_condition, y = first_cost_estimate)) +
      geom_point(aes(color = first_condition), alpha = 0.70, position = position_jitter(width = 0.25, height = 0.15)) +
      stat_summary(fun = mean, geom = "point", size=1.2) +
      stat_summary(fun.data="mean_se",  fun.args = list(mult=1), 
                   geom="errorbar", color = "black", width=0.1) +
      facet_wrap(~ medication_type, ncol = 2, scale="free") +
      scale_y_log10(breaks = c(1,3,10,30,100,1000, 3000, 10000, 100000), labels = dollars) +
      scale_color_discrete(guide = "none") +
    labs(y = 'Estimated value of treatment\nfor first visualization seen',
         x = '')
turkers_value_of_treatment
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1 rows containing non-finite values (`stat_summary()`).
## Removed 1 rows containing non-finite values (`stat_summary()`).

ggsave(here("analysis/figures/turkers_estimated_value_log10.pdf"), width=6, height=4)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1 rows containing non-finite values (`stat_summary()`).
## Removed 1 rows containing non-finite values (`stat_summary()`).
print("Blood pressure")
## [1] "Blood pressure"
blood.median.test <- df.turkers %>%
  filter(medication_type == "Blood pressure scenario") %>%
  median_test(first_cost_estimate ~ first_condition, data = ., alternative = "greater")

blood.median.test.pval <- pvalue(blood.median.test)
blood.median.test.zval <- statistic(blood.median.test)

print("COVID-19")
## [1] "COVID-19"
covid.median.test <- df.turkers %>%
  filter(medication_type == "COVID-19 scenario") %>%
  median_test(first_cost_estimate ~ first_condition, data = ., alternative = "greater")

covid.median.test.pval <- pvalue(covid.median.test)
covid.median.test.zval <- statistic(covid.median.test)

c(blood.median.test, covid.median.test)
## [[1]]
## 
##  Asymptotic Two-Sample Brown-Mood Median Test
## 
## data:  first_cost_estimate by
##   first_condition (Saw SEs first, Saw SDs first)
## Z = -0.065217, p-value = 0.526
## alternative hypothesis: true mu is greater than 0
## 
## 
## [[2]]
## 
##  Asymptotic Two-Sample Brown-Mood Median Test
## 
## data:  first_cost_estimate by
##   first_condition (Saw SEs first, Saw SDs first)
## Z = -0.57001, p-value = 0.7157
## alternative hypothesis: true mu is greater than 0

6.2.2 Within-subjects effect of seeing one visualization vs. the other on estimated value of treatment

plot_data <- df.turkers %>%
  mutate(diff_cost_se_vs_sd = cost_estimate_se - cost_estimate_sd) %>%
  group_by(medication_type, first_condition) %>%
  do(bootstrapped_median_ci(., "diff_cost_se_vs_sd", reps = 10000, conf = 0.95)) %>%
  group_by(medication_type) %>%
  mutate(y = 1:n())

kable(plot_data)
medication_type first_condition lower median upper y
Blood pressure scenario Saw SEs first -3 1 2 1
Blood pressure scenario Saw SDs first -5 0 0 2
COVID-19 scenario Saw SEs first -150 0 0 1
COVID-19 scenario Saw SDs first -5 0 0 2
ggplot(plot_data, aes(x = first_condition, y = median)) +
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  facet_wrap(~ medication_type, scale = "free_y") +
  labs(x = 'First visualization seen',
       y = 'Drop in estimated value of treatment\nwhen seeing SDs vs. SEs',
       title = 'Average drop in value of treatment estimate when seeing SDs vs. SEs',
       subtitle = 'Bars show one standard error')

scale <- 0.7
df_trimmed %>%
  mutate(diff_cost_se_vs_sd = cost_estimate_se - cost_estimate_sd) %>%
  ggplot(aes(x = diff_cost_se_vs_sd, y = first_condition, color = first_condition, fill = first_condition)) +
  geom_density_ridges(stat = "binline", binwidth = 1, scale = scale) +
  geom_errorbarh(data = plot_data, aes(x = median, xmin = lower, xmax = upper, y = y + scale), height = 0.1) +
  geom_point(data = plot_data, aes(x = median, y = y + scale)) +
  coord_cartesian(ylim = c(1.25, 2.25)) +
  labs(x = 'Drop in estimated value of the treatment\nwhen seeing SDs vs. SEs',
       y = 'First visualization seen',
       title = 'Distribution of drop in value of treatment estimate\nwhen seeing SDs vs. SEs') +
  facet_wrap(~ medication_type, scale = "free_x", ncol = 1) +
  theme(legend.position = "none")
## Warning in geom_errorbarh(data = plot_data, aes(x = median, xmin = lower, :
## Ignoring unknown aesthetics: x

plot_data <- df.turkers %>%
  group_by(medication_type) %>%
  filter(cost_estimate_sd > quantile(cost_estimate_sd, 0.025) &
           cost_estimate_sd < quantile(cost_estimate_sd, 0.975) &
           cost_estimate_se > quantile(cost_estimate_se, 0.025) &
           cost_estimate_se < quantile(cost_estimate_se, 0.975) ) %>%
  select(worker_id, medication_type, first_condition, cost_estimate_sd_lower_than_se, cost_estimate_sd, cost_estimate_se) %>%
  gather("variable", "value", cost_estimate_sd, cost_estimate_se) %>%
  mutate(variable = case_when(variable == 'cost_estimate_sd' ~ 'When seeing SDs',
                              variable == 'cost_estimate_se' ~ 'When seeing SEs'),
         variable = factor(variable, levels = c('When seeing SEs', 'When seeing SDs')))

(plot_data %>%
    group_by(medication_type, first_condition) %>%
    summarize(mean(cost_estimate_sd_lower_than_se)))
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   medication_type [2]
##   medication_type         first_condition `mean(cost_estimate_sd_lower_than_se)`
##   <chr>                   <fct>                                            <dbl>
## 1 Blood pressure scenario Saw SEs first                                    0.5  
## 2 Blood pressure scenario Saw SDs first                                    0.471
## 3 COVID-19 scenario       Saw SEs first                                    0.495
## 4 COVID-19 scenario       Saw SDs first                                    0.402
ggplot(plot_data, aes(x = variable, y = value, group = worker_id, color = cost_estimate_sd_lower_than_se)) +
  geom_point() +
  geom_line(alpha = 0.25) +
  facet_grid(medication_type ~ first_condition, scale = "free_y") +
  labs(x = '',
       y = 'Estimated value of the treatment',
       title = 'Change in value of treatment estimates when seeing SEs vs. SDs') +
  theme(legend.position = "none")

print("Blood pressure")
## [1] "Blood pressure"
df.turkers %>%
  mutate(diff_cost_se_vs_sd = cost_estimate_se - cost_estimate_sd) %>%
  filter(medication_type == "Blood pressure scenario") %>%
  median_test(diff_cost_se_vs_sd ~ first_condition, data = ., alternative = "greater")
## 
##  Asymptotic Two-Sample Brown-Mood Median Test
## 
## data:  diff_cost_se_vs_sd by
##   first_condition (Saw SEs first, Saw SDs first)
## Z = 0.87757, p-value = 0.1901
## alternative hypothesis: true mu is greater than 0
print("COVID-19")
## [1] "COVID-19"
df.turkers %>%
  mutate(diff_cost_se_vs_sd = cost_estimate_se - cost_estimate_sd) %>%
  filter(medication_type == "COVID-19 scenario") %>%
  median_test(diff_cost_se_vs_sd ~ first_condition, data = ., alternative = "greater")
## 
##  Asymptotic Two-Sample Brown-Mood Median Test
## 
## data:  diff_cost_se_vs_sd by
##   first_condition (Saw SEs first, Saw SDs first)
## Z = 1.0623, p-value = 0.1441
## alternative hypothesis: true mu is greater than 0

6.3 Recall of error bar type / meaning

After seeing each visualization and making two estimates (one for probability of superiority and the other for value), people were asked “What kind of error bars were shown in the plot on the previous page?” and given the choices of “Standard errors, showing uncertainty in estimating the average in each condition” as the first choice or “Standard deviations, showing the variation in individual outcomes in each condition” as the second choice.

6.3.1 Fraction of people who correctly answered the “what you saw” question the first time around

First let’s look at how people did just the first time around, based on the first visualization they saw. People seem to get this correct more often when seeing SD visualizations, perhaps because they naturally think that’s what error bars represent.

df.turkers %>%
  group_by(medication_type, first_condition) %>%
  mutate(passed_first_check = ifelse(first_condition == 'Saw SEs first',
                                     grepl('se$', what_you_saw_se),
                                     grepl('sd$', what_you_saw_sd))) %>%
  summarize(frac_correct = mean(passed_first_check)) %>%
  kable()
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
medication_type first_condition frac_correct
Blood pressure scenario Saw SEs first 0.4626866
Blood pressure scenario Saw SDs first 0.8205128
COVID-19 scenario Saw SEs first 0.5691057
COVID-19 scenario Saw SDs first 0.7936508

6.3.2 Fraction of people who correctly answered the “what you saw” question both times

A lower fraction of people got this question right both times.

df.turkers %>%
  group_by(medication_type, first_condition) %>%
  summarize(frac_correct = mean(grepl('sd$', what_you_saw_sd) & grepl('se$', what_you_saw_se))) %>%
  kable()
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
medication_type first_condition frac_correct
Blood pressure scenario Saw SEs first 0.4179104
Blood pressure scenario Saw SDs first 0.6239316
COVID-19 scenario Saw SEs first 0.4634146
COVID-19 scenario Saw SDs first 0.6587302

6.4 Re-running significance tests for primary hypotheses restricted to people who correctly answered the “what you saw” question both times

As a more stringent test of our primary hypotheses, we limit our attention to participants who correctly answered both questions about what type of error bars they saw. Presumably these people were more aware of what the visualizations they were shown meant. Thus if we see an effect here, this provides more evidence that differences between conditions are due to more than simply confusion about what type of error bars were shown.

6.4.1 Percent of subjects who recalled seeing SEs by first condition

df_recalled_condition_sd <- df.turkers %>%
  filter(first_condition_compressed == "SDs") %>%
  mutate(recalled_se = grepl('sd$', what_you_saw_sd)) %>%
  summarize(recalled_se_mean = mean(recalled_se),
            recalled_se_sd = sd(recalled_se),
            recalled_se_se = recalled_se_sd/sqrt(n()),
            condition="SDs")

df_recalled_condition_se <- df.turkers %>%
  filter(first_condition_compressed == "SEs") %>%
  mutate(recalled_se = grepl('se$', what_you_saw_se)) %>%
  summarize(recalled_se_mean = mean(recalled_se),
            recalled_se_sd = sd(recalled_se),
            recalled_se_se = recalled_se_sd/sqrt(n()),
            condition="SEs")

df_recalled_condition <- rbind(df_recalled_condition_sd, df_recalled_condition_se)

provider_recall_SE_mean <- format(100*signif(df_recalled_condition_se$recalled_se_mean, 3))
provider_recall_SE_mean_95_CI_lower <- format(100*signif(df_recalled_condition_se$recalled_se_mean - 1.96*df_recalled_condition_se$recalled_se_se, 3))
provider_recall_SE_mean_95_CI_upper <- format(100*signif(df_recalled_condition_se$recalled_se_mean + 1.96*df_recalled_condition_se$recalled_se_se, 3))

ggplot(df_recalled_condition, aes(x=condition, y=recalled_se_mean, color=condition)) +
  geom_pointrange(aes(ymin=recalled_se_mean - recalled_se_se, ymax=recalled_se_mean + recalled_se_se)) +
   scale_y_continuous(label=percent, lim=c(0, 1)) +
      theme(legend.position="none") +
    labs(y="% who recalled correct meaning of error bars", x="First saw condition")

ggsave("./figures/turkers-first-saw-recalled.pdf", width=4, height=4)

6.4.2 Between-subjects effect of first visualization seen on first probability of superiority estimate

df_recalled_correct <- df.turkers %>%
  filter(grepl('sd$', what_you_saw_sd) & grepl('se$', what_you_saw_se)) 

plot_data <- df_recalled_correct %>%
  mutate(direction = ifelse(superiority_estimate_sd < superiority_estimate_se, T, F)) %>%
  select(worker_id, medication_type, first_condition, direction, first_superiority_estimate, second_superiority_estimate) %>%
  group_by(medication_type, first_condition) %>%
  summarize(mu = mean(first_superiority_estimate),
            se = sd(first_superiority_estimate) / sqrt(n())) %>% 
  group_by(medication_type) %>%
  mutate(y = 1:n())
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
kable(plot_data)
medication_type first_condition mu se y
Blood pressure scenario Saw SEs first 78.51786 1.999187 1
Blood pressure scenario Saw SDs first 67.58904 1.281058 2
COVID-19 scenario Saw SEs first 78.05263 1.760828 1
COVID-19 scenario Saw SDs first 69.53012 1.070994 2
ggplot(plot_data, aes(x = first_condition, y = mu)) +
  geom_hline(data = true_effects, aes(yintercept = 100*auc), linetype = 'dashed') +
  geom_pointrange(aes(ymin = mu - se, ymax = mu + se)) +
  facet_wrap(~ medication_type) +
  labs(x = 'First visualization seen',
       y = 'Estimated probability of superiority\nfor first visualization seen',
       title = 'Average probability of superiority estimate for first visualization seen',
       subtitle = 'Bars show one standard error')

df_recalled_correct %>%
  group_by(medication_type) %>% 
  do(tidy(t.test(first_superiority_estimate ~ first_condition,
                 data = .,
                 conf.level = 0.95,
                 alternative = c("greater")))) %>%
  kable()
medication_type estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
Blood pressure scenario 10.928816 78.51786 67.58904 4.602736 6.3e-06 96.95341 6.985569 Inf Welch Two Sample t-test greater
COVID-19 scenario 8.522511 78.05263 69.53012 4.135221 3.8e-05 96.11488 5.099546 Inf Welch Two Sample t-test greater

6.4.3 Within-subjects effect of seeing one visualization vs. the other on probability of superiority

plot_data <- df_recalled_correct %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type, first_condition) %>%
  summarize(mu = mean(diff_superiority_se_vs_sd),
            se = sd(diff_superiority_se_vs_sd) / sqrt(n())) %>% 
  group_by(medication_type) %>%
  mutate(y = 1:n())
## `summarise()` has grouped output by 'medication_type'. You can override using
## the `.groups` argument.
kable(plot_data)
medication_type first_condition mu se y
Blood pressure scenario Saw SEs first 12.446429 1.990875 1
Blood pressure scenario Saw SDs first 8.616438 1.634231 2
COVID-19 scenario Saw SEs first 9.210526 2.102889 1
COVID-19 scenario Saw SDs first 6.108434 1.418882 2
ggplot(plot_data, aes(x = first_condition, y = mu)) +
  geom_pointrange(aes(ymin = mu - se, ymax = mu + se)) +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  facet_wrap(~ medication_type) +
  labs(x = 'First visualization seen',
       y = 'Drop in estimated probability of superiority\nwhen seeing SDs vs. SEs',
       title = 'Average drop in probability of superiority estimate when seeing SDs vs. SEs',
       subtitle = 'Bars show one standard error')

df_recalled_correct %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd) %>%
  group_by(medication_type) %>% 
  do(tidy(t.test(x = .$diff_superiority_se_vs_sd,
                 mu = 0,
                 conf.level = 0.95,
                 type = "two.sample",
                 alternative = c("greater")))) %>%
  kable()
medication_type estimate statistic p.value parameter conf.low conf.high method alternative
Blood pressure scenario 10.279070 8.081967 0 128 8.171807 Inf One Sample t-test greater
COVID-19 scenario 7.371429 6.129715 0 139 5.380101 Inf One Sample t-test greater

6.5 Distribution builder estimates

distbuilder_values_by_worker_turkers <- df.turkers %>% 
  select(worker_id, medication_type, first_condition,
         superiority_estimate_sd, superiority_estimate_se,
         distbuilder_control_sd, distbuilder_treatment_sd,
         distbuilder_control_se, distbuilder_treatment_se) %>% 
  gather("distribution", "values", matches('distbuilder')) %>%
  mutate(values = str_remove(values, '\\['),
         values = str_remove(values, ']'),
         count = strsplit(as.character(values), ",")) %>%
  unnest(count) %>%
  group_by(worker_id, first_condition, medication_type,
           superiority_estimate_sd, superiority_estimate_se,
           distribution) %>%
  mutate(count = as.numeric(count),
         bin = row_number()) %>%
  left_join(bin_labels) %>%
  rename(bin_number = bin, bin = label) %>%
  mutate(bin = reorder(bin, bin_number))
## Joining with `by = join_by(medication_type, bin)`
plot_data <- merge(bin_labels, true_dists, all.x = T, all.y = T) %>%
  mutate(true_frac = pnorm(upper + 1/2, mean = mean, sd = sd) - pnorm(lower - 1/2, mean = mean, sd = sd))

# plot only first responses by first condition
plot_data_distbuilder_turkers <- distbuilder_values_by_worker_turkers %>%
  filter((grepl('SDs', first_condition) & grepl('sd$', distribution)) | (grepl('SEs', first_condition) & grepl('se$', distribution))) %>%
  filter(!is.na(bin)) %>%
  mutate(treatment_or_control = ifelse(grepl('treatment', distribution), 'treatment', 'control'),
         se_or_sd = ifelse(grepl('se$', distribution), 'When seeing SEs', 'When seeing SDs')) %>%
  group_by(distribution, treatment_or_control, first_condition, medication_type, bin, bin_number, center) %>%
  summarize(total_count = sum(count)) %>%
  group_by(medication_type, distribution) %>%
  mutate(frac = total_count / sum(total_count))
## `summarise()` has grouped output by 'distribution', 'treatment_or_control',
## 'first_condition', 'medication_type', 'bin', 'bin_number'. You can override
## using the `.groups` argument.
turkers_all_first_condition_distbuilder <- ggplot(plot_data_distbuilder_turkers, aes(x = center, y = frac)) +
    #geom_vline(data = true_dists,
    #           aes(xintercept = mean, color = distribution),
    #           linetype = 'dotted') +
    geom_segment(aes(color = treatment_or_control, xend=center), yend=0, stat = "identity", position = "identity", alpha = 0.5, size=3) +
    geom_line(data = plot_data, aes(x = center, y = true_frac, color = distribution), linetype = 'solid') +
    facet_grid(first_condition ~ medication_type, scale = "free_x") +
    labs(x = 'Bin',
         y = 'Percent of balls placed in each bin'
         #title = 'Distribution builder responses across all participants'#,
         #subtitle = 'Dotted lines show true means'
         ) +
    scale_y_continuous(label = scales::label_percent(accuracy = 1L)) +
    scale_color_manual(name="", labels=c("Placebo", "New medicine"), values=c("#7fbf7b", "#af8dc3")) +
    theme_minimal() +
    theme(legend.position = c(0.5, 0.9),
          legend.title = element_blank(),
          axis.text.x=element_text(angle=90, hjust=1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
turkers_all_first_condition_distbuilder

ggsave("figures/turkers_all_first_condition_distbuilder_responses.pdf", width=4, height=4)

6.5.1 Distbuilder estimates

Implied probability of superiorities

get_implied_auc_by_worker <- function(values_by_worker) {
  values_by_worker %>%
    #mutate(estimate = ifelse(first_condition == "Saw SDs first", superiority_estimate_sd, superiority_estimate_se)) %>%
    mutate(
      is_sd_dist = grepl("_sd$", distribution),
      is_treatment = grepl("_treatment_", distribution)
    ) %>%
    filter((first_condition == "Saw SDs first" & is_sd_dist) | (first_condition == "Saw SEs first" & !is_sd_dist)) %>%
    group_by(worker_id, first_condition, medication_type) %>%
    mutate(is_treatment = as.factor(is_treatment)) %>%
    do({
      data.frame(is_treatment = rep(.$is_treatment, .$count),
                 x = rep(.$center, .$count))
    }) %>%
    yardstick::roc_auc(is_treatment, x) %>%
    select(-.metric, -.estimator, auc = .estimate) %>%
    ungroup()
  
  
}
implied_auc_by_worker <- get_implied_auc_by_worker(distbuilder_values_by_worker_turkers)

model.implied.psup <- lm(auc ~ medication_type + first_condition, data = implied_auc_by_worker)
distbuilder.psup.est <- format(signif(-100*(tidy(model.implied.psup) %>% filter(term == "first_conditionSaw SDs first"))$estimate, 3))
distbuilder.psup.ttest <-  apa_print(model.implied.psup)$statistic$first_conditionSaw_SDs_first
summary(model.implied.psup)
## 
## Call:
## lm(formula = auc ~ medication_type + first_condition, data = implied_auc_by_worker)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66752 -0.13489  0.02203  0.12268  0.34987 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.66752    0.01247  53.519   <2e-16 ***
## medication_typeCOVID-19 scenario  0.01549    0.01475   1.050    0.294    
## first_conditionSaw SDs first     -0.02404    0.01476  -1.629    0.104    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1648 on 497 degrees of freedom
## Multiple R-squared:  0.00724,    Adjusted R-squared:  0.003245 
## F-statistic: 1.812 on 2 and 497 DF,  p-value: 0.1644

Implied estimates

implied_auc_by_turker <- get_implied_auc_by_worker(distbuilder_values_by_worker_turkers)
model.implied.psup.turker <- lm(auc ~ medication_type + first_condition, data = implied_auc_by_turker)
summary(model.implied.psup.turker)
## 
## Call:
## lm(formula = auc ~ medication_type + first_condition, data = implied_auc_by_turker)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66752 -0.13489  0.02203  0.12268  0.34987 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.66752    0.01247  53.519   <2e-16 ***
## medication_typeCOVID-19 scenario  0.01549    0.01475   1.050    0.294    
## first_conditionSaw SDs first     -0.02404    0.01476  -1.629    0.104    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1648 on 497 degrees of freedom
## Multiple R-squared:  0.00724,    Adjusted R-squared:  0.003245 
## F-statistic: 1.812 on 2 and 497 DF,  p-value: 0.1644
distbuilder.psup.est.turk <- format(signif(100*(tidy(model.implied.psup.turker) %>% filter(term == "first_conditionSaw SDs first"))$estimate, 3))
distbuilder.psup.ttest.turk <-  apa_print(model.implied.psup.turker)$statistic$first_conditionSaw_SDs_first
plot.implied.psups.turker <- plot_implied_auc(implied_auc_by_turker)
plot.implied.psups.turker

ggsave("figures/turkers_implied_distbuilder_psups.pdf", width=5, height=3)

6.6 Correlations between probability of superiority estimates and estimated value of treatment

6.6.1 Between-subjects on first estimate

df.turkers %>%
  group_by(medication_type) %>%
  filter(first_cost_estimate > quantile(first_cost_estimate, 0.05) &
         first_cost_estimate < quantile(first_cost_estimate, 0.95)) %>%
  summarize(spearman_cor = cor(first_superiority_estimate, first_cost_estimate, method = "spearman")) %>%
  kable()
medication_type spearman_cor
Blood pressure scenario 0.0191266
COVID-19 scenario -0.0074005
df.turkers %>%
  group_by(medication_type) %>%
  filter(first_cost_estimate > quantile(first_cost_estimate, 0.05) &
         first_cost_estimate < quantile(first_cost_estimate, 0.95)) %>%
  ggplot(aes(x = first_superiority_estimate, y = first_cost_estimate)) +
  geom_point(alpha = 0.10) +
  #geom_smooth() +
  facet_grid(medication_type ~ first_condition, scale = "free")

plot_data <- df.turkers %>%
  group_by(medication_type) %>%
  mutate(diff_superiority_se_vs_sd = superiority_estimate_se - superiority_estimate_sd,
         direction_superiority_se_vs_sd = case_when(diff_superiority_se_vs_sd > 0 ~ 'Increased',
                                                    diff_superiority_se_vs_sd == 0 ~ 'No change',
                                                    diff_superiority_se_vs_sd < 0 ~ 'Decreased'),
         diff_cost_se_vs_sd = cost_estimate_se - cost_estimate_sd,
         direction_cost_se_vs_sd = case_when(diff_cost_se_vs_sd > 0 ~ 'Increased',
                                             diff_cost_se_vs_sd == 0 ~ 'No change',
                                             diff_cost_se_vs_sd < 0 ~ 'Decreased')) %>%
  count(direction_superiority_se_vs_sd, direction_cost_se_vs_sd) %>%
  mutate(frac = n / sum(n))

plot_data %>%
  kable()
medication_type direction_superiority_se_vs_sd direction_cost_se_vs_sd n frac
Blood pressure scenario Decreased Decreased 20 0.0796813
Blood pressure scenario Decreased Increased 17 0.0677291
Blood pressure scenario Decreased No change 10 0.0398406
Blood pressure scenario Increased Decreased 16 0.0637450
Blood pressure scenario Increased Increased 93 0.3705179
Blood pressure scenario Increased No change 43 0.1713147
Blood pressure scenario No change Decreased 10 0.0398406
Blood pressure scenario No change Increased 9 0.0358566
Blood pressure scenario No change No change 33 0.1314741
COVID-19 scenario Decreased Decreased 20 0.0803213
COVID-19 scenario Decreased Increased 19 0.0763052
COVID-19 scenario Decreased No change 13 0.0522088
COVID-19 scenario Increased Decreased 21 0.0843373
COVID-19 scenario Increased Increased 85 0.3413655
COVID-19 scenario Increased No change 32 0.1285141
COVID-19 scenario No change Decreased 5 0.0200803
COVID-19 scenario No change Increased 9 0.0361446
COVID-19 scenario No change No change 45 0.1807229
plot_data %>%
  ungroup() %>%
  mutate(direction_superiority_se_vs_sd = fct_relevel(direction_superiority_se_vs_sd, c("Decreased", "No change", "Increased")),
         direction_cost_se_vs_sd = fct_relevel(direction_cost_se_vs_sd, c("Decreased", "No change", "Increased"))) %>%
  ggplot(aes(x = direction_superiority_se_vs_sd, y = direction_cost_se_vs_sd)) +
  geom_tile(aes(fill = frac, color = frac)) +
  geom_text(aes(label = sprintf('%0.0f%%', 100*frac))) +
  facet_wrap(~ medication_type) +
  scale_color_gradient(low = "white",
                      high = "red") +
  scale_fill_gradient(low = "white",
                      high = "red") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    plot.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.ticks = element_blank()
    #axis.text.x = element_blank(),
    #axis.text.y = element_blank()
    #axis.title.x = element_blank(),
    #axis.title.y = element_blank()
  ) +
  labs(title = 'Directional change in superiority and cost estimates',
       subtitle = 'Sum of the nine percentages in each panel is 100%',
       x = 'Change in superiority estimate from SDs to SEs',
       y = 'Change in cost estimate from SDs to SEs')

7 Free response feedback

df.turkers %>%
  filter(feedback != "") %>%
  select(medication_type, first_condition, superiority_estimate_se, superiority_estimate_sd, feedback) %>%
  datatable(
    caption = "Feedback",
    filter = "top",
    rownames = FALSE,
    options = list(autoWidth = TRUE)
  )

8 R Session information with package versions

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-apple-darwin22.4.0 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /usr/local/Cellar/openblas/0.3.23/lib/libopenblasp-r0.3.23.dylib 
## LAPACK: /usr/local/Cellar/r/4.3.0_1/lib/R/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lmtest_0.9-40     zoo_1.8-12        sandwich_3.0-2    ggeffects_1.2.2  
##  [5] forcats_1.0.0     stringr_1.5.0     dplyr_1.1.2       purrr_1.0.1      
##  [9] readr_2.1.4       tidyr_1.3.0       tibble_3.2.1      tidyverse_2.0.0  
## [13] papaja_0.1.1      tinylabels_0.2.3  patchwork_1.1.2   yardstick_1.2.0  
## [17] DescTools_0.99.49 here_1.0.1        ggbeeswarm_0.7.2  ggplot2_3.4.2    
## [21] coin_1.4-2        survival_3.5-5    boot_1.3-28.1     effectsize_0.8.3 
## [25] broom_1.0.4       scales_1.2.1      knitr_1.42        lubridate_1.9.2  
## [29] ggridges_0.5.4    digest_0.6.31     DT_0.28          
## 
## loaded via a namespace (and not attached):
##  [1] gld_2.6.6          readxl_1.4.2       rlang_1.1.1        magrittr_2.0.3    
##  [5] multcomp_1.4-23    matrixStats_0.63.0 e1071_1.7-13       compiler_4.3.0    
##  [9] systemfonts_1.0.4  vctrs_0.6.2        crayon_1.5.2       pkgconfig_2.0.3   
## [13] fastmap_1.1.1      ellipsis_0.3.2     backports_1.4.1    labeling_0.4.2    
## [17] utf8_1.2.3         rmarkdown_2.21     tzdb_0.4.0         ragg_1.2.5        
## [21] bit_4.0.5          xfun_0.39          modeltools_0.2-23  cachem_1.0.8      
## [25] jsonlite_1.8.4     highr_0.10         parallel_4.3.0     R6_2.5.1          
## [29] bslib_0.4.2        stringi_1.7.12     jquerylib_0.1.4    cellranger_1.1.0  
## [33] estimability_1.4.1 Rcpp_1.0.10        parameters_0.21.0  Matrix_1.5-4      
## [37] splines_4.3.0      timechange_0.2.0   tidyselect_1.2.0   rstudioapi_0.14   
## [41] yaml_2.3.7         codetools_0.2-19   lattice_0.21-8     withr_2.5.0       
## [45] bayestestR_0.13.1  coda_0.19-4        evaluate_0.21      proxy_0.4-27      
## [49] pillar_1.9.0       stats4_4.3.0       insight_0.19.1     generics_0.1.3    
## [53] vroom_1.6.3        rprojroot_2.0.3    hms_1.1.3          munsell_0.5.0     
## [57] rootSolve_1.8.2.3  class_7.3-21       glue_1.6.2         emmeans_1.8.6     
## [61] lmom_2.9           tools_4.3.0        data.table_1.14.8  Exact_3.2         
## [65] mvtnorm_1.1-3      grid_4.3.0         crosstalk_1.2.0    libcoin_1.0-9     
## [69] datawizard_0.7.1   colorspace_2.1-0   beeswarm_0.4.0     vipor_0.4.5       
## [73] cli_3.6.1          textshaping_0.3.6  fansi_1.0.4        expm_0.999-7      
## [77] gtable_0.3.3       sass_0.4.6         TH.data_1.1-2      farver_2.1.1      
## [81] htmlwidgets_1.6.2  htmltools_0.5.5    lifecycle_1.0.3    httr_1.4.6        
## [85] bit64_4.0.5        MASS_7.3-58.4